library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
baseball<-read.csv("baseball.csv")
#creating a new variable in our dataset for specified condition named *condition*: binary indicator variable indicating runner on 3rd with less than 2 outs
data <- baseball %>%
mutate(condition = ifelse(outs_when_up < 2 & (!is.na(on_3b)), 1, 0))
#baseline filtering out of bunts/incorrectly recorded data
#assuming > 40mph is reasonable to assume, removing NA values
data <- data %>%
filter(bat_speed > 40) %>%
filter(bat_speed != "NA")
#create new variable for average bat speed of each player
#create new variable for deviance of each swing compared to individual player's averages
#create new variable that records the standard deviation of each player's bat speed
#create new variable that indicates speed three standard deviations above and below a player's average bat speed
data <- data %>%
group_by(player_name) %>%
mutate(avg_bat_speed = mean(bat_speed, na.rm = TRUE),
deviance = ifelse(!is.na(bat_speed), bat_speed - avg_bat_speed, NA),
sd_bat_speed = sd(bat_speed, na.rm = TRUE),
three_sd_above = avg_bat_speed + 3 * sd_bat_speed,
three_sd_below = avg_bat_speed - 3 * sd_bat_speed
) %>%
ungroup()
#filters out unreasonable outliers (laying outside the 3-standard deviation range) of each individual player's average bat speed.
data <- data %>% filter(bat_speed < three_sd_above) %>% filter(bat_speed > three_sd_below)
#modeling bat speed and swing length
library(lme4)
## Loading required package: Matrix
#utilizing (1 | player_name) as a random effect to account for player-to-player variability in bat_speed
#allows us to more robustly observe the effect of the condition on bat speed
bat_speed_model <- lmer(
bat_speed ~ condition + (1 | player_name),
data = data
)
#utilizing (1 | player_name) as a random effect to account for player-to-player variability in swing_length
#allows us to more robustly observe the effect of the condition on swing length
swing_length_model <- lmer(
swing_length ~ condition + (1 | player_name),
data = data
)
#read in results
summary(bat_speed_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bat_speed ~ condition + (1 | player_name)
## Data: data
##
## REML criterion at convergence: 1852713
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.3557 -0.5542 0.1048 0.6579 3.4541
##
## Random effects:
## Groups Name Variance Std.Dev.
## player_name (Intercept) 7.496 2.738
## Residual 22.546 4.748
## Number of obs: 310705, groups: player_name, 647
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 70.31310 0.10923 643.74
## condition -0.51477 0.03907 -13.17
##
## Correlation of Fixed Effects:
## (Intr)
## condition -0.018
summary(swing_length_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: swing_length ~ condition + (1 | player_name)
## Data: data
##
## REML criterion at convergence: 727320.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.8935 -0.6808 0.0304 0.7274 6.3042
##
## Random effects:
## Groups Name Variance Std.Dev.
## player_name (Intercept) 0.1583 0.3979
## Residual 0.6029 0.7764
## Number of obs: 310705, groups: player_name, 647
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 7.244184 0.015925 454.882
## condition -0.043433 0.006389 -6.798
##
## Correlation of Fixed Effects:
## (Intr)
## condition -0.020
#function to assist visualization in further steps
create_circle <- function(center_x, center_y, radius, n_points = 100) {
theta <- seq(0, 2 * pi, length.out = n_points)
data.frame(
x = center_x + radius * cos(theta),
y = center_y + radius * sin(theta)
)
}
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
#will help us view the highest density of high run expectancies in the data
filtered_data <- subset(data, delta_run_exp > 2.5)
#pinpoint location of the highest density of run expectancies over 2.5
density_estimate <- kde2d(
x = filtered_data$deviance,
y = filtered_data$delta_run_exp,
n = 100
)
#steps to create circle to assist visualization aid
max_density_idx <- which(density_estimate$z == max(density_estimate$z), arr.ind = TRUE)
center_x <- density_estimate$x[max_density_idx[1]]
center_y <- density_estimate$y[max_density_idx[2]]
circle_data <- create_circle(center_x, center_y, radius = 0.5)
#plots bat speed deviance against run expectancies, adding a circle to the location of the highest density of run expectancies of 2.5 -- allows us to visually determine strategy
ggplot(data, aes(x = deviance, y = delta_run_exp, color = inning)) +
geom_point(alpha = 0.7) +
geom_polygon(data = circle_data, aes(x = x, y = y), inherit.aes = FALSE,
color = "red", fill = NA, size = 1) +
labs(
title = "Relationship Between Bat Speed Deviance and Run Expectancy",
x = "Deviance from Average Bat Speed",
y = "Run Expectancy"
) + # Preserve aspect ratio
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

#view the bat speed plotted against location of the hit -- again, allows us to examine strategy possibilities
ggplot(data, aes(x = hc_x, y = hc_y, color = bat_speed)) +
geom_point(alpha = 0.7, size = 3) +
scale_color_viridis_c(option = "plasma", name = "Bat Speed") +
labs(
title = "Hit Coordinates Colored by Bat Speed",
x = "Hit Coordinate X (hc_x)",
y = "Hit Coordinate Y (hc_y)"
) +
theme_minimal()
## Warning: Removed 196266 rows containing missing values or values outside the scale range
## (`geom_point()`).

#same plots, specified for condition (runner on 3rd, less than 2 outs)
data3 <- data %>% filter(
outs_when_up < 2 & (!is.na(on_3b))
)
filtered_data3 <- subset(data3, delta_run_exp > 2.5)
density_estimate3 <- kde2d(
x = filtered_data3$deviance,
y = filtered_data3$delta_run_exp,
n = 100
)
max_density_idx3 <- which(density_estimate3$z == max(density_estimate3$z), arr.ind = TRUE)
center_x3 <- density_estimate3$x[max_density_idx3[1]]
center_y3 <- density_estimate3$y[max_density_idx3[2]]
circle_data3 <- create_circle(center_x3, center_y3, radius = 0.5)
ggplot(data3, aes(x = deviance, y = delta_run_exp, color = inning)) +
geom_point(alpha = 0.7) +
geom_polygon(data = circle_data3, aes(x = x, y = y), inherit.aes = FALSE,
color = "red", fill = NA, size = 1) +
labs(
title = "Relationship Between Bat Speed Deviance and Run Expectancy",
x = "Deviance from Average Bat Speed",
y = "Run Expectancy"
) + # Preserve aspect ratio
theme_minimal()

ggplot(data3, aes(x = hc_x, y = hc_y, color = bat_speed)) +
geom_point(alpha = 0.7, size = 3) +
scale_color_viridis_c(option = "plasma", name = "Bat Speed") +
labs(
title = "Hit Coordinates Colored by Bat Speed",
x = "Hit Coordinate X (hc_x)",
y = "Hit Coordinate Y (hc_y)"
) +
theme_minimal()
## Warning: Removed 9875 rows containing missing values or values outside the scale range
## (`geom_point()`).

#same as above, for win expectancies
#take absolute value, as win expectancies are negative for away teams
data <- data %>% mutate(delta_home_win_exp = abs(delta_home_win_exp))
filtered_data <- subset(data, delta_home_win_exp > .2)
#pinpoint location of the highest density of win expectancies over .2
density_estimate <- kde2d(
x = filtered_data$deviance,
y = filtered_data$delta_home_win_exp,
n = 100
)
#steps to create circle to assist visualization aid
max_density_idx <- which(density_estimate$z == max(density_estimate$z), arr.ind = TRUE)
center_x <- density_estimate$x[max_density_idx[1]]
center_y <- density_estimate$y[max_density_idx[2]]
circle_data <- create_circle(center_x, center_y, radius = 0.3)
#plots bat speed deviance against run expectancies, adding a circle to the location of the highest density of win expectancies of >.2 -- allows us to visually determine strategy
ggplot(data, aes(x = deviance, y = delta_home_win_exp, color = inning)) +
geom_point(alpha = 0.7) +
geom_polygon(data = circle_data, aes(x = x, y = y), inherit.aes = FALSE,
color = "red", fill = NA, size = 1) +
labs(
title = "Relationship Between Bat Speed Deviance and Run Expectancy",
x = "Deviance from Average Bat Speed",
y = "Run Expectancy"
) +
theme_minimal()

#same plots, specified for condition (runner on 3rd, less than 2 outs)
data3 <- data %>% filter(
outs_when_up < 2 & (!is.na(on_3b))
)
filtered_data3 <- subset(data3, delta_home_win_exp > .2)
density_estimate3 <- kde2d(
x = filtered_data3$deviance,
y = filtered_data3$delta_home_win_exp,
n = 100
)
max_density_idx3 <- which(density_estimate3$z == max(density_estimate3$z), arr.ind = TRUE)
center_x3 <- density_estimate3$x[max_density_idx3[1]]
center_y3 <- density_estimate3$y[max_density_idx3[2]]
circle_data3 <- create_circle(center_x3, center_y3, radius = 0.3)
ggplot(data3, aes(x = deviance, y = delta_home_win_exp, color = inning)) +
geom_point(alpha = 0.7) +
geom_polygon(data = circle_data3, aes(x = x, y = y), inherit.aes = FALSE,
color = "red", fill = NA, size = 1) +
labs(
title = "Relationship Between Bat Speed Deviance and Win Expectancy",
x = "Deviance from Average Bat Speed",
y = "Run Expectancy"
) +
theme_minimal()

#creating a new variable in our dataset for specified condition named *condition*: binary indicator variable indicating 2 strikes and runners on bases.
data <- baseball %>%
mutate(condition = ifelse(strikes == 2 & (!is.na(on_1b) | !is.na(on_2b) | !is.na(on_3b)), 1, 0))
data <- data %>%
filter(bat_speed > 40) %>%
filter(bat_speed != "NA")
data <- data %>%
group_by(player_name) %>%
mutate(avg_bat_speed = mean(bat_speed, na.rm = TRUE),
deviance = ifelse(!is.na(bat_speed), bat_speed - avg_bat_speed, NA),
sd_bat_speed = sd(bat_speed, na.rm = TRUE),
three_sd_above = avg_bat_speed + 3 * sd_bat_speed,
three_sd_below = avg_bat_speed - 3 * sd_bat_speed
) %>%
ungroup()
data <- data %>% filter(bat_speed < three_sd_above) %>% filter(bat_speed > three_sd_below)
bat_speed_model <- lmer(
bat_speed ~ condition + (1 | player_name),
data = data
)
swing_length_model <- lmer(
swing_length ~ condition + (1 | player_name),
data = data
)
summary(bat_speed_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: bat_speed ~ condition + (1 | player_name)
## Data: data
##
## REML criterion at convergence: 1848168
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.4251 -0.5534 0.1026 0.6558 3.6525
##
## Random effects:
## Groups Name Variance Std.Dev.
## player_name (Intercept) 7.503 2.739
## Residual 22.218 4.714
## Number of obs: 310705, groups: player_name, 647
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 70.54120 0.10930 645.39
## condition -1.60224 0.02323 -68.96
##
## Correlation of Fixed Effects:
## (Intr)
## condition -0.034
summary(swing_length_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: swing_length ~ condition + (1 | player_name)
## Data: data
##
## REML criterion at convergence: 726681.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.7868 -0.6850 0.0311 0.7253 6.2946
##
## Random effects:
## Groups Name Variance Std.Dev.
## player_name (Intercept) 0.1583 0.3979
## Residual 0.6016 0.7756
## Number of obs: 310705, groups: player_name, 647
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 7.257893 0.015933 455.5
## condition -0.100167 0.003823 -26.2
##
## Correlation of Fixed Effects:
## (Intr)
## condition -0.038
create_circle <- function(center_x, center_y, radius, n_points = 100) {
theta <- seq(0, 2 * pi, length.out = n_points)
data.frame(
x = center_x + radius * cos(theta),
y = center_y + radius * sin(theta)
)
}
library(ggplot2)
library(MASS)
filtered_data <- subset(data, delta_run_exp > 2.5)
density_estimate <- kde2d(
x = filtered_data$deviance,
y = filtered_data$delta_run_exp,
n = 100
)
max_density_idx <- which(density_estimate$z == max(density_estimate$z), arr.ind = TRUE)
center_x <- density_estimate$x[max_density_idx[1]]
center_y <- density_estimate$y[max_density_idx[2]]
circle_data <- create_circle(center_x, center_y, radius = 0.5)
ggplot(data, aes(x = deviance, y = delta_run_exp, color = inning)) +
geom_point(alpha = 0.7) +
geom_polygon(data = circle_data, aes(x = x, y = y), inherit.aes = FALSE,
color = "red", fill = NA, size = 1) +
labs(
title = "Relationship Between Bat Speed Deviance and Run Expectancy",
x = "Deviance from Average Bat Speed",
y = "Run Expectancy"
) + # Preserve aspect ratio
theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(data, aes(x = hc_x, y = hc_y, color = bat_speed)) +
geom_point(alpha = 0.7, size = 3) +
scale_color_viridis_c(option = "plasma", name = "Bat Speed") +
labs(
title = "Hit Coordinates Colored by Bat Speed",
x = "Hit Coordinate X (hc_x)",
y = "Hit Coordinate Y (hc_y)"
) +
theme_minimal()
## Warning: Removed 196266 rows containing missing values or values outside the scale range
## (`geom_point()`).

data3 <- data %>% filter(
outs_when_up < 2 & (!is.na(on_3b))
)
filtered_data3 <- subset(data3, delta_run_exp > 2.5)
density_estimate3 <- kde2d(
x = filtered_data3$deviance,
y = filtered_data3$delta_run_exp,
n = 100
)
max_density_idx3 <- which(density_estimate3$z == max(density_estimate3$z), arr.ind = TRUE)
center_x3 <- density_estimate3$x[max_density_idx3[1]]
center_y3 <- density_estimate3$y[max_density_idx3[2]]
circle_data3 <- create_circle(center_x3, center_y3, radius = 0.5)
ggplot(data3, aes(x = deviance, y = delta_run_exp, color = inning)) +
geom_point(alpha = 0.7) +
geom_polygon(data = circle_data3, aes(x = x, y = y), inherit.aes = FALSE,
color = "red", fill = NA, size = 1) +
labs(
title = "Relationship Between Bat Speed Deviance and Run Expectancy",
x = "Deviance from Average Bat Speed",
y = "Run Expectancy"
) + # Preserve aspect ratio
theme_minimal()

ggplot(data3, aes(x = hc_x, y = hc_y, color = bat_speed)) +
geom_point(alpha = 0.7, size = 3) +
scale_color_viridis_c(option = "plasma", name = "Bat Speed") +
labs(
title = "Hit Coordinates Colored by Bat Speed",
x = "Hit Coordinate X (hc_x)",
y = "Hit Coordinate Y (hc_y)"
) +
theme_minimal()
## Warning: Removed 9875 rows containing missing values or values outside the scale range
## (`geom_point()`).

data <- data %>% mutate(delta_home_win_exp = abs(delta_home_win_exp))
filtered_data <- subset(data, delta_home_win_exp > .2)
density_estimate <- kde2d(
x = filtered_data$deviance,
y = filtered_data$delta_home_win_exp,
n = 100
)
max_density_idx <- which(density_estimate$z == max(density_estimate$z), arr.ind = TRUE)
center_x <- density_estimate$x[max_density_idx[1]]
center_y <- density_estimate$y[max_density_idx[2]]
circle_data <- create_circle(center_x, center_y, radius = 0.3)
ggplot(data, aes(x = deviance, y = delta_home_win_exp, color = inning)) +
geom_point(alpha = 0.7) +
geom_polygon(data = circle_data, aes(x = x, y = y), inherit.aes = FALSE,
color = "red", fill = NA, size = 1) +
labs(
title = "Relationship Between Bat Speed Deviance and Run Expectancy",
x = "Deviance from Average Bat Speed",
y = "Run Expectancy"
) +
theme_minimal()

data3 <- data %>% filter(
outs_when_up < 2 & (!is.na(on_3b))
)
filtered_data3 <- subset(data3, delta_home_win_exp > .2)
density_estimate3 <- kde2d(
x = filtered_data3$deviance,
y = filtered_data3$delta_home_win_exp,
n = 100
)
max_density_idx3 <- which(density_estimate3$z == max(density_estimate3$z), arr.ind = TRUE)
center_x3 <- density_estimate3$x[max_density_idx3[1]]
center_y3 <- density_estimate3$y[max_density_idx3[2]]
circle_data3 <- create_circle(center_x3, center_y3, radius = 0.3)
ggplot(data3, aes(x = deviance, y = delta_home_win_exp, color = inning)) +
geom_point(alpha = 0.7) +
geom_polygon(data = circle_data3, aes(x = x, y = y), inherit.aes = FALSE,
color = "red", fill = NA, size = 1) +
labs(
title = "Relationship Between Bat Speed Deviance and Win Expectancy",
x = "Deviance from Average Bat Speed",
y = "Run Expectancy"
) +
theme_minimal()
